---
title: "Control Variables"
output: html_notebook
---

First, I estimate the total population and rural population of Colombia. 

```{r}
library(tidyverse)
library(readstata13)
library(foreign)
library(readxl)
library(writexl)

rcs_data <- read_xlsx("TSCS-Colombia(1964-2005).xlsx") 
  # This is a template of a TSCS structure with municipality-year units of Colombia.

### 1985-1994

dane_8594 <- read_xlsx("DANE-PopulationMunicipality_AreaSexAge(1985-1994).xlsx")

dane_8594 <- dane_8594 %>%
  select(DP:"ÁREA GEOGRÁFICA", "Total General")

dane_8594 <- dane_8594 %>%
  rename(divipola = DPMP,
         year = AÑO,
         area = "ÁREA GEOGRÁFICA",
         total = "Total General")

dane_8594$divipola <- as.numeric(dane_8594$divipola)

dane_8594r <- dane_8594 %>%
  filter(area == "Centros Poblados y Rural Disperso")

dane_8594r <- dane_8594r %>%
  rename(rural = total) %>%
  select(divipola, year, rural)

dane_8594t <- dane_8594 %>%
  filter(area == "Total")

dane_8594t <- dane_8594t %>%
  select(divipola, year, total)

dane_8594f <- left_join(dane_8594t, dane_8594r)

dane_8594f <- dane_8594f %>%
  mutate(rurality = rural/total)

### 1995-2004

dane_9404 <- read_xlsx("DANE-PopulationMunicipality_AreaSexAge(1995-2004).xlsx")

dane_9404 <- dane_9404 %>%
  select(DP:"ÁREA GEOGRÁFICA", "Total General")

dane_9404 <- dane_9404 %>%
  rename(divipola = DPMP,
         year = AÑO,
         area = "ÁREA GEOGRÁFICA",
         total = "Total General")

dane_9404$divipola <- as.numeric(dane_9404$divipola)

dane_9404r <- dane_9404 %>%
  filter(area == "Centros Poblados y Rural Disperso")

dane_9404r <- dane_9404r %>%
  rename(rural = total) %>%
  select(divipola, year, rural)

dane_9404t <- dane_9404 %>%
  filter(area == "Total")

dane_9404t <- dane_9404t %>%
  select(divipola, year, total)

dane_9404f <- left_join(dane_9404t, dane_9404r)

dane_9404f <- dane_9404f %>%
  mutate(rurality = rural/total)

rm(dane_9404, dane_9404r, dane_9404t)

### 2005

dane_05 <- read_xlsx("DANE-PopulationMunicipality_AreaSexAge(2005-2017).xlsx")

dane_05 <- dane_05 %>%
  select(DP:"ÁREA GEOGRÁFICA", "Total General")

dane_05 <- dane_05 %>%
  rename(divipola = DPMP,
         year = AÑO,
         area = "ÁREA GEOGRÁFICA",
         total = "Total General")

dane_05 <- dane_05 %>%
  filter(year == 2005)

dane_05$divipola <- as.numeric(dane_05$divipola)

dane_05r <- dane_05 %>%
  filter(area == "Centros Poblados y Rural Disperso")

dane_05r <- dane_05r %>%
  rename(rural = total) %>%
  select(divipola, year, rural)

dane_05t <- dane_05 %>%
  filter(area == "Total")

dane_05t <- dane_05t %>%
  select(divipola, year, total)

dane_05f <- left_join(dane_05t, dane_05r)

dane_05f <- dane_05f %>%
  mutate(rurality = rural/total)

rm(dane_05, dane_05r, dane_05t)

dane_8505 <- bind_rows(dane_8594f, dane_9404f, dane_05f)

write_xlsx(dane_8505, "Ortega(2024)-DANE-Population(Total&Rural)_19852005.xlsx") # You can omit this step.
  # Checked 09/28/2024
```

Merge the two datasets:

```{r}
pop <- dane_8505 %>%
  rename(updated_pop = total)

cv <- left_join(rcs_data, pop)

cv <- cv %>%
  mutate(lpop = log(updated_pop)) %>%
  relocate(divipola, year, lpop, rurality)

```

Shared vote of the left in local council elections:

```{r}
left <- read_xlsx("Ortega(2024)-ElectionsCouncils&LeftRepresentation(1984-2003).xlsx")
  # See the folder 'CEDAE-PreProcessing' for more information on how this dataset was prepared.

col_tscs <- read.dta13("TSCS_Colombia(1958-2010)-V(12072021).dta")
  # This is a template of a TSCS structure with municipality-year units of Colombia.

col_tscs <- col_tscs %>%
  filter(year > 1983 & year < 2006) %>%
  rename(divipola = codmpio) %>%
  dplyr::select(year, divipola)

left2 <- left_join(col_tscs, left)

left2 <- left2 %>%
  mutate(c_lsh_m = (t_left_m/total_v)*100,
         c_lsh_c = (t_left_core/total_v)*100,
         c_lsh_b = (t_left_broad/total_v)*100,
         c_lsh_e = (t_left_extended/total_v)*100)

left2 <- left2 %>%
  dplyr::group_by(year) %>%
  mutate(median_c_lsh_m = median(c_lsh_m, na.rm = T),
         median_c_lsh_c = median(c_lsh_c, na.rm = T),
         median_c_lsh_b = median(c_lsh_b, na.rm = T),
         median_c_lsh_e = median(c_lsh_e, na.rm = T),) %>%
  dplyr::ungroup()

left2 <- left2 %>%
  dplyr::group_by(divipola) %>%
  fill(total_v, t_left_m, t_left_core, t_left_broad, t_left_extended,
       c_lsh_m, c_lsh_c, c_lsh_b, c_lsh_e,
       median_c_lsh_m, median_c_lsh_c, median_c_lsh_b, median_c_lsh_e,
       .direction = "down") %>% 
  dplyr::ungroup()

left2 <- left2 %>%
  drop_na(total_v)

left2 <- left2 %>%
  rename(cv3c.c_lsh1 = c_lsh_m, # Manual
         cv3c.c_lsh2 = c_lsh_c, # Core
         cv3c.c_lsh3 = c_lsh_b, # Broad
         cv3c.c_lsh4 = c_lsh_e # Extended
         ) 

left2 <- left2 %>%
  dplyr::select(year, divipola, cv3c.c_lsh1:median_c_lsh_e)

cv <- left_join(cv, left2)

```

State Capacity

```{r}
st_capac <- read.dta13("CEDE_CaracteristicasGenerales.dta")
  # This dataset starts in 1993. I will complete the data on NBI with the 1985 Census.

st_capac$year <- year(st_capac$ano)

st_capac <- st_capac %>%
  rename(divipola = codmpio) %>%
  filter(year < 2006) %>%
  dplyr::select(divipola, year, nbi)

st_capac <- st_capac %>%
  dplyr::group_by(divipola) %>%
  fill(nbi, 
       .direction = "downup") %>% 
  dplyr::ungroup()

# Now, estimate the NBI for 1985
## There are five components to estimate the NBI, I found information on the first three.

census_v85 <- read.dta13("VIVIENDAS.dta")

  ## Componente: Vivienda Inadecuada
census_v85 <- census_v85 %>%
  mutate(
    nbi1.1 = case_when( # Tipo de vivienda
      V11_TIPV == "Vivienda movil" | V11_TIPV == "Vivienda no habitable" ~ 1,
    TRUE ~ 0),
    nbi1.2 = case_when( # Tipo de paredes
      V13_MATP == "Sin paredes" | V13_MATP == "Tela o desecho" ~ 1,
    TRUE ~ 0),
    nbi1.3 = case_when( # Tipo de piso
      V14_MATP == "Tierra" ~ 1,
      TRUE ~ 0)) %>%
  relocate(I01_DPTO, I03_MPIO, I05_SECT, nbi1.1:nbi1.3)

  ## Componente: Servicios Inadecuados

census_v85 <- census_v85 %>%
  mutate(
    nbi2.1 = if_else(V18_SERS == "No Tiene", 1, 0),
    nbi2.2 = case_when(
      V17_ACUE == 4 & 
        (V24_AGUA == "Quebrada" | V24_AGUA == "Carro tanque" 
         | V24_AGUA == "Agua lluvia") ~ 1,
      TRUE ~ 0)) %>%
  relocate(I01_DPTO:nbi1.3, nbi2.1, nbi2.2)

  ## Componente: Hacinamiento

census_v85 <- census_v85 %>%
  mutate(hac = V25_TOPP/V15_NROC,
         nbi3.1 = if_else(hac > 3, 1, 0)) %>%
  relocate(I01_DPTO:nbi2.2, nbi3.1)

  ## Componente: Inasistencia Escolar. No data on this component.

  ## Componente: Dependencia Económica. No data on this component.

census_v85 <- census_v85 %>%
  rowwise() %>% 
  mutate(total = sum(c_across(nbi1.1:nbi3.1), na.rm = T))

census_v85 <- census_v85 %>%
  mutate(nbi = if_else(total > 0, 1, 0))

dat_cv3 <- census_v85 %>%
  group_by(I01_DPTO, I03_MPIO) %>%
  summarise(pop_c = n())
  
x1 <- census_v85 %>%
  filter(nbi == 1) %>%
  group_by(I01_DPTO, I03_MPIO) %>%
  summarise(nbi_pop = n())

dat_nbi <- left_join(dat_cv3, x1, by=c("I01_DPTO", "I03_MPIO"))

dat_nbi <- dat_nbi %>% replace(is.na(.), 0)

dat_nbi <- dat_nbi %>%
  mutate(nbi = (nbi_pop/pop_c)*100)

  # Fix divipola

cede <- read.dta13("TSCS_Colombia(1958-2010)-V(12072021).dta")

cede <- cede %>%
  filter(year == 1985)

cede$divipola_ch <- as.character(cede$codmpio)

cede$codmpio_ch = str_sub(cede$divipola_ch, - 3, - 1) 

cede$I03_MPIO <- as.numeric(cede$codmpio_ch)

cede <- cede %>%
  rename(I01_DPTO = coddepto)

dat_nbi <- left_join(dat_nbi, cede, by=c("I01_DPTO", "I03_MPIO"))

dat_nbi_f <- dat_nbi %>%
  select(codmpio, municipio, nbi_pop, nbi) %>%
  rename(divipola = codmpio)

dat_nbi_f <- dat_nbi_f %>%
  drop_na(divipola) %>%
  select(-I01_DPTO)

write_xlsx(dat_nbi_f, "Ortega(2024)-DANE-Census1985_NBI(06202022).xlsx")
  # Checked

# Now, I will merge the two parts of the dataset.

NBI <- dat_nbi_f %>%
  mutate(year = 1985) %>%
  select(divipola, year, nbi)

st_capac_f <- bind_rows(NBI, st_capac)

st_capac_f <- st_capac_f %>%
  select(-I01_DPTO)

### Now merge with the master dataset of CVs

cv <- left_join(cv, st_capac_f)

cv <- cv %>%
  dplyr::group_by(divipola) %>%
  fill(nbi, 
       .direction = "down") %>% 
  dplyr::ungroup()

cv <- cv %>%
  select(divipola:cv3c.c_lsh4, nbi, median_c_lsh_m:median_c_lsh_e)

cv <- cv %>%
  rename(cv1.lpop = lpop, # Logged Population
         cv2.rur_p = rurality, # Percentage Rurality
         cv4.nbi = nbi) # NBI

write_xlsx(cv, "Ortega(2022)-CV_ToComplete(06202022).xlsx")
  # You can omit this step.
  # Checked

```

Updated CV: Military Variables

```{r}

cv2 <- cv %>%
  select(divipola, year, updated_pop)

mil <- read_csv("Ortega(2024)-OMC-TSCS_CasosFinal-Updated(09142024).csv")
  # See the folder 'OMC-PreProcessing-Updated' for more information on how this data was prepared.

mil <- mil %>%
  rename(divipola = divipola_alt)

mil$divipola <- as.numeric(mil$divipola) # Transform a numeric into a character

mil2 <- left_join(cv2, mil, by=c("divipola", "year"))

mil2 <- mil2 %>%
  mutate_at(vars(conf_events:cl_rg_pm_2b)
            , ~replace_na(., 0))

mil2 <- mil2 %>%
  group_by(divipola, year) %>%
  mutate(t_sf_ua_up1 = sf_air_attacks + sf_mil_operations +
           sf_ambushes + sf_hitnruns + sf_raids,
         t_pm_ua_up1 = pm_tatt_mil_targets + pm_ambushes + pm_hitnruns +
           pm_raids,
         t_rg_ua_up1 =  rg_tatt_mil_targets + rg_ambushes + rg_hitnruns 
         + rg_raids,
         bd_st_rg_omc_up1.2 = ((cl_st_rg_2 + t_sf_ua_up1)/(cl_st_rg_2 + t_sf_ua_up1 + t_rg_ua_up1))) %>% # I
  dplyr::select(divipola, year, bd_st_rg_omc_up1.2,
                t_sf_ua_up1, t_pm_ua_up1, t_rg_ua_up1,
                conf_events, sf_assassinations, pm_assassinations, sf_massacres, pm_massacres)

mil2 <- mil2 %>%
  rename(
    conf_events_up1 = conf_events, 
    sf_assassinations_up1 = sf_assassinations, 
    pm_assassinations_up1 = pm_assassinations, 
    sf_massacres_up1 = sf_massacres, 
    pm_massacres_up1 = pm_massacres)

mil_cv <- left_join(cv2, mil2)

# CV6.1a - OMC Total Conflict Events
mil_cv <- mil_cv %>%
  mutate(conf_events_omc_r_up1 = (conf_events_up1/updated_pop)*100000) 

# CV7.1a - OMC Civilian Deaths by PGF
mil_cv <- mil_cv %>%
  mutate(pgf_ckill_up1 = sf_assassinations_up1 + pm_assassinations_up1,
         pgf_cmass_up1 = sf_massacres_up1 + pm_massacres_up1,
         pgf_dv_up1 = pgf_ckill_up1 + pgf_cmass_up1,
         pgf_ckill_omc_r_up1 =  (pgf_ckill_up1/updated_pop)*100000,
         pgf_cmass_omc_r_up1 =  (pgf_cmass_up1/updated_pop)*100000,
         pgf_dv_omc_r_up1 =  (pgf_dv_up1/updated_pop)*100000)

mil_cv <- mil_cv %>%
  relocate(divipola, year, 
           bd_st_rg_omc_up1.2, conf_events_omc_r_up1, pgf_ckill_omc_r_up1, pgf_cmass_omc_r_up1, pgf_dv_omc_r_up1)

write_xlsx(mil_cv, "Ortega(2024)-CV-MilitaryVariables_Updated(09282024).xlsx")
  #Checked.

```

Now I merge the two datasets
```{r}

cv <- left_join(cv, mil_cv, by = c("divipola", "year"))

### Recruitment

recruit <- read_csv("verdata-reclutamiento-R1.csv")

recruit_reb <- recruit %>%
  rename(divipola = muni_code_hecho,
         year = yy_hecho) %>%
  filter(p_str == "GUE-FARC" | p_str == "GUE-ELN" | p_str == "GUE-OTRO")

dane_ru <- recruit_reb %>%
  group_by(divipola, year) %>%
  summarise(reb_ru = n())

m_data_f3b <- left_join(rcs_data, dane_ru) # The data on recruitment covers the period from 1990 to 2005. Thus, I replace NA by 0 in this period and filter the data since 1990. Before that period, the data can still have NAs

m_data_f3b <- m_data_f3b %>%
  filter(year > 1989)

m_data_f3b <- m_data_f3b %>% replace_na(list(reb_ru = 0)) 

m_data_f3b <- m_data_f3b %>%
  dplyr::select(year, divipola, reb_ru)

cv <- left_join(cv, m_data_f3b)

cv <- cv %>%
  mutate(reb_ru2 = case_when(
    reb_ru == 0 ~ 0,
    reb_ru > 0 ~ 1
  ))

```

I complete the main CV dataset:

```{r}

cv <- cv %>%
  rename(cv5c.1.bd_o = bd_st_rg_omc_up1.2, # Military Dispute
         cv6b.1.ce_o_r = conf_events_omc_r_up1, # Conflict Events // Updated
         cv7b.1.pgf_ck_o_r = pgf_ckill_omc_r_up1, # Civilian Assassinations & Massacres by PGF
         cv7b.2.pgf_cm_o_r = pgf_cmass_omc_r_up1,
         cv7b.3.pgf_cd_o_r = pgf_dv_omc_r_up1)

cv <- cv %>%
  relocate(divipola, year, 
           cv1.lpop, cv2.rur_p, cv3c.c_lsh1, cv4.nbi,
           cv5c.1.bd_o, cv6b.1.ce_o_r, cv7b.1.pgf_ck_o_r, cv7b.2.pgf_cm_o_r, cv7b.3.pgf_cd_o_r, 
           reb_ru, reb_ru2, 
           median_c_lsh_m, median_c_lsh_c, median_c_lsh_b, median_c_lsh_e)

cv3 <- cv %>%
  select(divipola, year, 
           cv1.lpop, cv2.rur_p, cv3c.c_lsh1, cv4.nbi,
           cv5c.1.bd_o, cv6b.1.ce_o_r, cv7b.1.pgf_ck_o_r, cv7b.2.pgf_cm_o_r, cv7b.3.pgf_cd_o_r, 
           reb_ru, reb_ru2, 
           median_c_lsh_m, median_c_lsh_c, median_c_lsh_b, median_c_lsh_e)

write_xlsx(cv, "Ortega(2024)-CV_FV-Updated(09282024).xlsx")
  # Checked
```

A brief note on the name of the files:

I will keep the original names of the raw datasets used for the analysis. Whenever I have intervened or modified a dataset (e.g., by creating a dummy out of a numeric variable), I will name the dataset as, "Ortega(2024)...".